In [64]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.cross_validation import train_test_split,cross_val_score,ShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
import seaborn as sns

In [2]:
data = pd.read_csv("data.csv")

In [3]:
data.rename(columns={'Customer name': 'CustomerName', 'Company category': 'CompanyCategory','Annual salary':'AnnualSalary','Can buy paid?':'CanBuyPaid?' }, inplace=True)

In [4]:
data['CanBuyPaid?'] = data['CanBuyPaid?'].replace(to_replace = {'Yes':1,'No':0})

In [5]:
number = LabelEncoder()
data['CustomerName'] = number.fit_transform(data.CustomerName)
data['Occupation'] = number.fit_transform(data.Occupation)
data['CompanyCategory'] = number.fit_transform(data.CompanyCategory)

In [6]:
train, test = train_test_split(data, train_size=0.7,random_state=42)

In [7]:
train_X = train.ix[:,:-1]
train_Y = train.ix[:,-1:]
test_X = test.ix[:,:-1]
test_Y = test.ix[:,-1:]

In [8]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()

In [9]:
regressor.fit(train_X, train_Y)


Out[9]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
print('Weight coefficients: ', regressor.coef_)
print('y-axis intercept: ', regressor.intercept_)


('Weight coefficients: ', array([[ -6.36064825e-03,  -4.14973963e-03,   1.03833854e-02,
          8.37297742e-02,   4.29574962e-10]]))
('y-axis intercept: ', array([ 0.54944046]))

In [11]:
preds = regressor.predict(test_X)

In [12]:
predictions = [round(value) for value in preds]

In [13]:
accuracy = accuracy_score(test_Y, predictions)

In [14]:
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 80.00%

In [15]:
#RMSE
from sklearn import metrics
print np.sqrt(metrics.mean_squared_error(test_Y, preds))


0.456179544644

In [16]:
print(metrics.mean_absolute_error(test_Y, preds))
print(metrics.mean_squared_error(test_Y, preds))


0.389421635669
0.208099776952

Accuracy on Train Data


In [17]:
y_pred_train = regressor.predict(train_X)
predictions = [round(value) for value in y_pred_train]
accuracy = accuracy_score(train_Y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 71.43%

SVM


In [18]:
# Support Vector Machines
svc = SVC(gamma=10)
svc.fit(train_X, train_Y)
Y_pred = svc.predict(test_X)
svc.score(train_X, train_Y)


/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[18]:
1.0

In [19]:
#cross_val_score(SVC(C=10), train_X,train_Y, cv=3, scoring="f1")

In [20]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [21]:
rfrModel = RandomForestRegressor()
rfrModel.fit(train_X,train_Y)
preds = rfrModel.predict(test_X)
predictions = [round(value) for value in preds]
accuracy = accuracy_score(test_Y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 53.33%
/home/chetan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  from ipykernel import kernelapp as app

In [22]:
plt.scatter(preds, test_Y)


Out[22]:
<matplotlib.collections.PathCollection at 0x7f984c32d710>

In [23]:
from sklearn.metrics import r2_score, mean_squared_error
# Metric 1: correlation coefficient.
r2_score(preds, test_Y)


Out[23]:
-6.3036649214659688

In [24]:
# Metric 2: mean squared error
mean_squared_error(preds, test_Y)


Out[24]:
0.24799999999999997

In [25]:
cv = ShuffleSplit(n=len(train_X), n_iter=15, test_size=0.3)

In [26]:
models = dict()
models['rf'] = RandomForestRegressor()
models['gb'] = GradientBoostingRegressor()
models['ad'] = AdaBoostRegressor()
models['ex'] = ExtraTreesRegressor()
scores = dict()

for abbr, model in models.items():
    print(abbr, model)
    score = cross_val_score(model, train_X, train_Y, cv=cv, scoring='mean_squared_error')
    scores[abbr] = -score


('ex', ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False))
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:1665: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
('gb', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False))
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
('ad', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None))
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)

In [27]:
score = dict()
for algo, score_value in scores.iteritems():
    print algo,accuracy_score(test_Y, [round(value) for value in score_value])


ad 0.666666666667
rf 0.733333333333
ex 0.666666666667
gb 0.666666666667

In [28]:
score_summary = pd.DataFrame(scores)
score_summary = pd.DataFrame(score_summary.unstack()).reset_index()
# sns.violinplot(x=)
score_summary.columns = ['model', 'idx', 'error']
sns.violinplot(x='model', y='error', data=score_summary)


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f984c0a7990>

In [29]:
from sklearn.neighbors import KNeighborsRegressor
kneighbor_regression = KNeighborsRegressor(n_neighbors=1)
kneighbor_regression.fit(train_X, train_Y)


Out[29]:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=1, p=2,
          weights='uniform')

In [30]:
x_knr_pred = kneighbor_regression.predict(train_X)
knr_predictions = [round(value) for value in x_knr_pred]

print("Accuracy: %.2f%%" % (accuracy_score(train_Y, knr_predictions) * 100.0))


Accuracy: 100.00%

In [31]:
y_pred_test = kneighbor_regression.predict(test_X)
y_knr_predictions = [round(value) for value in y_pred_test]

print("Accuracy: %.2f%%" % (accuracy_score(test_Y, y_knr_predictions) * 100.0))


Accuracy: 60.00%

In [32]:
from sklearn.linear_model import Ridge

In [33]:
ridge_models = {}
training_scores = []
test_scores = []

for alpha in [100, 10, 1, .01]:
    ridge = Ridge(alpha=alpha).fit(train_X, train_Y)
    training_scores.append(ridge.score(train_X, train_Y))
    test_scores.append(ridge.score(test_X, test_Y))
    ridge_models[alpha] = ridge
for train,test in zip(training_scores,test_scores):
    print train,test


0.0708517803506 -0.00540984941797
0.0837419604572 -0.0388888773424
0.08564940759 -0.0602875993609
0.0856852526793 -0.0641054022047

In [34]:
plt.figure()
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.legend(loc="best")


Out[34]:
<matplotlib.legend.Legend at 0x7f984b2a1810>

In [35]:
ridgeRegModel = Ridge(alpha=.01).fit(train_X,train_Y)
ridgePred = ridgeRegModel.predict(test_X)
ridge_predictions = [round(value) for value in ridgePred]
print("Accuracy: %.2f%%" % (accuracy_score(test_Y, ridge_predictions) * 100.0))


Accuracy: 80.00%

In [36]:
from sklearn.learning_curve import learning_curve

def plot_learning_curve(est, X, y):
    training_set_size, train_scores, test_scores = learning_curve(est, X, y, train_sizes=np.linspace(.1, 1, 20))
    estimator_name = est.__class__.__name__
    line = plt.plot(training_set_size, train_scores.mean(axis=1), '--', label="training scores " + estimator_name)
    plt.plot(training_set_size, test_scores.mean(axis=1), '-', label="test scores " + estimator_name, c=line[0].get_color())
    plt.xlabel('Training set size')
    plt.legend(loc='best')
    plt.ylim(-0.1, 1.1)


/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/learning_curve.py:23: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the functions are moved. This module will be removed in 0.20
  DeprecationWarning)

In [37]:
plt.figure()
plot_learning_curve(LinearRegression(), train_X, train_Y)
plot_learning_curve(Ridge(alpha=10), train_X, train_Y)



In [38]:
from sklearn.linear_model import Lasso

In [39]:
lasso_models = {}
training_scores = []
test_scores = []

for alpha in [30, 10, 1, .01]:
    lasso = Lasso(alpha=alpha).fit(train_X, train_Y)
    training_scores.append(lasso.score(train_X, train_Y))
    test_scores.append(lasso.score(test_X, test_Y))
    lasso_models[alpha] = lasso
for train,test in zip(training_scores,test_scores):
    print train,test


0.00168034413579 0.00729735198793
0.00168034466195 0.00730388946765
0.0162340507226 0.0250314839281
0.084939945255 -0.040922013627

In [40]:
plt.figure()
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [30, 10, 1, .01])
plt.legend(loc="best")


Out[40]:
<matplotlib.legend.Legend at 0x7f984b095e10>

In [41]:
plt.figure(figsize=(10, 5))
plot_learning_curve(LinearRegression(), train_X, train_Y)
plot_learning_curve(Ridge(alpha=10), train_X, train_Y)
plot_learning_curve(Lasso(alpha=10), train_X, train_Y)



In [48]:
import xgboost as xgb

In [52]:
xgbmodel = xgb.XGBRegressor()
xgbmodel.fit(train_X,train_Y)


Out[52]:
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [54]:
xgb_y_pred = xgbmodel.predict(test_X)
xgb_prediction = [round(value) for value in xgb_y_pred]

In [56]:
accuracy = accuracy_score(test_Y,xgb_prediction)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 73.33%

In [61]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, random_state=7)

In [63]:
for train_index, test_index in kf.split(train_X):
    print train_index, test_index


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-63-5014bbaad380> in <module>()
----> 1 for train_index, test_index in kf.split(train_X):
      2     print train_index, test_index

AttributeError: 'KFold' object has no attribute 'split'

In [66]:
results = cross_val_score(xgbmodel, train_X, train_Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-66-33caaf120d25> in <module>()
----> 1 results = cross_val_score(xgbmodel, train_X, train_Y, cv=kfold)
      2 print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

/home/chetan/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
   1569                                               train, test, verbose, None,
   1570                                               fit_params)
-> 1571                       for train, test in cv)
   1572     return np.array(scores)[:, 0]
   1573 

TypeError: 'KFold' object is not iterable

In [65]:
rng = np.random.RandomState(31337)
kf = KFold(train_Y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBRegressor().fit(train_X[train_index],train_Y[train_index])
    predictions = xgb_model.predict(test_X[test_index])
    print 'hello'


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-65-6e7240bbd397> in <module>()
      1 rng = np.random.RandomState(31337)
----> 2 kf = KFold(train_Y.shape[0], n_folds=2, shuffle=True, random_state=rng)
      3 for train_index, test_index in kf:
      4     xgb_model = xgb.XGBRegressor().fit(train_X[train_index],train_Y[train_index])
      5     predictions = xgb_model.predict(test_X[test_index])

TypeError: __init__() got an unexpected keyword argument 'n_folds'

In [ ]:
# train_X, test_X, train_y, test_y = train_test_split(train, test, 
#                                                     train_size=0.5, 
#                                                     random_state=123)